package au.com.acpfg.proteomics.mascotrdr; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Vector; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.DataType; import org.knime.core.data.RowKey; import org.knime.core.data.collection.CollectionCellFactory; import org.knime.core.data.collection.ListCell; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.DoubleCell; import org.knime.core.data.def.IntCell; import org.knime.core.data.def.StringCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.defaultnodesettings.SettingsModel; import org.knime.core.node.defaultnodesettings.SettingsModelDoubleBounded; import org.knime.core.node.defaultnodesettings.SettingsModelIntegerBounded; import org.knime.core.node.defaultnodesettings.SettingsModelString; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.NodeLogger; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; import au.com.acpfg.misc.spectra.AbstractSpectraCell; import au.com.acpfg.misc.spectra.MGFSpectraCell; import au.com.acpfg.misc.spectra.MyMGFPeakList; import au.com.acpfg.misc.spectra.SpectraUtilityFactory; import be.proteomics.mascotdatfile.util.interfaces.FragmentIon; import be.proteomics.mascotdatfile.util.interfaces.MascotDatfileInf; import be.proteomics.mascotdatfile.util.interfaces.QueryToPeptideMapInf; import be.proteomics.mascotdatfile.util.mascot.PeptideHit; import be.proteomics.mascotdatfile.util.mascot.PeptideHitAnnotation; import be.proteomics.mascotdatfile.util.mascot.ProteinHit; import be.proteomics.mascotdatfile.util.mascot.Query; import be.proteomics.mascotdatfile.util.mascot.enumeration.MascotDatfileType; import be.proteomics.mascotdatfile.util.mascot.factory.MascotDatfileFactory; /** * This is the model implementation of MascotReader. * Using the MascotDatFile open-source java library, this node provides an interface to that, to provide convenient access to MatrixScience Mascot datasets * * @author Andrew Cassin */ public class MascotReaderNodeModel extends NodeModel { // the logger instance private static final NodeLogger logger = NodeLogger .getLogger(MascotReaderNodeModel.class); private static int N_COLS = 15; // number of output columns for the node /** the settings key which is used to retrieve and store the settings (from the dialog or from a settings file) (package visibility to be usable from the dialog). */ static final String CFGKEY_FOLDER = "folder"; static final String CFGKEY_CONFIDENCE = "confidence"; static final String CFGKEY_RESULTTYPE = "results-selection"; /** initial default count value. */ private static final String DEFAULT_FOLDER = "/tmp"; private static final double DEFAULT_CONFIDENCE = 0.05; // 95% CI private static final String DEFAULT_RESULTTYPE = "all"; // all hits for all spectra: one of ("all", "best" or "confident") // example value: the models count variable filled from the dialog // and used in the models execution method. The default components of the // dialog work with "SettingsModels". private final SettingsModelString m_folder = make_as_string(CFGKEY_FOLDER); private final SettingsModelDoubleBounded m_confidence = (SettingsModelDoubleBounded) make(CFGKEY_CONFIDENCE); private final SettingsModelString m_resulttype = make_as_string(CFGKEY_RESULTTYPE); /** * Constructor for the node model. */ protected MascotReaderNodeModel() { // one outgoing port only super(0, 1); } protected static SettingsModel make(String k) { if (k.equals(CFGKEY_FOLDER)) { return new SettingsModelString(CFGKEY_FOLDER, DEFAULT_FOLDER); } else if (k.equals(CFGKEY_CONFIDENCE)) { SettingsModel sm = new SettingsModelDoubleBounded(CFGKEY_CONFIDENCE, DEFAULT_CONFIDENCE, 0.0, 1.0); sm.setEnabled(false); // only correct since the default result type is not confident return sm; } else if (k.equals(CFGKEY_RESULTTYPE)) { return new SettingsModelString(CFGKEY_RESULTTYPE, DEFAULT_RESULTTYPE); } return null; } protected static SettingsModelString make_as_string(String k) { return (SettingsModelString) make(k); } /** * {@inheritDoc} */ @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { logger.info("MascotReader: about to load .DAT files from "+m_folder.getStringValue()); // the data table spec of the single output table, // the table will have three columns: DataColumnSpec[] allColSpecs = new DataColumnSpec[N_COLS]; allColSpecs[0] = new DataColumnSpecCreator("Peptide Sequence", StringCell.TYPE).createSpec(); allColSpecs[1] = new DataColumnSpecCreator("Modified Peptide Sequence", StringCell.TYPE).createSpec(); allColSpecs[2] = new DataColumnSpecCreator("Ion Score", DoubleCell.TYPE).createSpec(); allColSpecs[3] = new DataColumnSpecCreator("Identity Threshold", DoubleCell.TYPE).createSpec(); allColSpecs[4] = new DataColumnSpecCreator("Mass Error", DoubleCell.TYPE).createSpec(); allColSpecs[5] = new DataColumnSpecCreator("Protein Accessions", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); allColSpecs[6] = new DataColumnSpecCreator("Protein Starts", ListCell.getCollectionType(IntCell.TYPE)).createSpec(); allColSpecs[7] = new DataColumnSpecCreator("Protein Ends", ListCell.getCollectionType(IntCell.TYPE)).createSpec(); allColSpecs[8] = new DataColumnSpecCreator("E-value", DoubleCell.TYPE).createSpec(); allColSpecs[9] = new DataColumnSpecCreator("Reported in", StringCell.TYPE).createSpec(); allColSpecs[10]= new DataColumnSpecCreator("Missed Cleavages", IntCell.TYPE).createSpec(); allColSpecs[11]= new DataColumnSpecCreator("Spectrum Title", StringCell.TYPE).createSpec(); allColSpecs[12]= new DataColumnSpecCreator("Matching Ions (list of ion=m/z pairs, B&Y only)", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); allColSpecs[13]= new DataColumnSpecCreator("Theoretical Ions (list of ion=m/z pairs)", ListCell.getCollectionType(StringCell.TYPE)).createSpec(); allColSpecs[14]= new DataColumnSpecCreator("Spectra", AbstractSpectraCell.TYPE).createSpec(); DataTableSpec outputSpec = new DataTableSpec(allColSpecs); // the execution context will provide us with storage capacity, in this // case a data container to which we will add rows sequentially // Note, this container can also handle arbitrary big data tables, it // will buffer to disc if necessary. BufferedDataContainer container = exec.createDataContainer(outputSpec); // process all suitably named (regardless of case) files ie. dat files only File[] dat_files = new File(m_folder.getStringValue()).listFiles(); int row_id = 0; int done = 0; int bad = 0; int good = 0; int total = dat_files.length; for (File f : dat_files) { if (f.getName().toLowerCase().endsWith(".dat")) { logger.info("Processing Mascot DAT file: "+f.getName()); // if the .dat file has not been produced by mascot, we will get an exception quickly so we just catch and // count problem files here... MascotDatfileInf mascot_dat_file; QueryToPeptideMapInf q2pm; Vector good_hits; try { mascot_dat_file = MascotDatfileFactory.create(f.getAbsolutePath(), MascotDatfileType.INDEX); q2pm = mascot_dat_file.getQueryToPeptideMap(); good++; // consider the .DAT file good if we get here without throw for (int query=1; query<q2pm.getNumberOfQueries(); query++) { good_hits = q2pm.getAllPeptideHits(query); // no hits for the query? if (good_hits == null) { logger.debug("No hits available for query: "+query+" in "+f.getName()+" should be "+q2pm.getNumberOfPeptideHits(query)); } else { // only output hits according to the chosen strategy: // 1. best hit only: only element 0 (which is always the best hit) is output // 2. all hits // 3. above user-chosen confidence level for the current FILE only Query q = mascot_dat_file.getQuery(query); String title = q.getTitle(); boolean is_all = m_resulttype.getStringValue().startsWith("all"); boolean is_best = m_resulttype.getStringValue().startsWith("best"); boolean is_confidence = (!is_all && !is_best); int max = is_best ? 1 : good_hits.size(); for (int i=0; i<max; i++) { PeptideHit ph = (PeptideHit) good_hits.elementAt(i); if (is_confidence && !ph.scoresAboveIdentityThreshold(m_confidence.getDoubleValue())) { continue; } DataCell[] cells = new DataCell[N_COLS]; cells[0] = new StringCell(ph.getSequence()); cells[1] = new StringCell(ph.getModifiedSequence()); cells[2] = new DoubleCell(ph.getIonsScore()); cells[3] = new DoubleCell(ph.calculateIdentityThreshold()); cells[4] = new DoubleCell(ph.getDeltaMass()); cells[5] = CollectionCellFactory.createListCell(toProtAccsns(ph)); cells[6] = CollectionCellFactory.createListCell(toProtStart(ph)); cells[7] = CollectionCellFactory.createListCell(toProtEnd(ph)); cells[8] = new DoubleCell(ph.getExpectancy()); cells[9] = new StringCell(f.getName()); cells[10] = new IntCell(ph.getMissedCleavages()); cells[11] = new StringCell(title); cells[12] = matchingIonsCell(ph, q, mascot_dat_file); cells[13] = theoreticalIonsCell(ph, q, mascot_dat_file); cells[14] = make_spectra(q); DataRow row = new DefaultRow("Hit"+row_id, cells); container.addRowToTable(row); row_id++; } good_hits.clear(); good_hits = null; } } } catch (Exception e) { logger.warn("Cannot process "+f.getName()+" - file corrupt?"); bad++; continue; } exec.checkCanceled(); done++; exec.setProgress(((double)done) / total, "Processed "+f.getName()); // try to avoid heap space problems... q2pm = null; mascot_dat_file.finish(); mascot_dat_file = null; } else { logger.warn("Encountered non-.DAT file: "+f.getName()+ " -- ignored."); done++; } } // report overall activity if (bad > 0) { logger.warn("File summary: "+(good+bad)+" total, "+bad+" bad, "+good+" good .DAT files."); } else { logger.info("Processed "+good+" Mascot .DAT files"); } // once we are done, we close the container and return its table container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[]{out}; } private DataCell theoreticalIonsCell(PeptideHit ph, Query q, MascotDatfileInf mascot_dat_file) { PeptideHitAnnotation pha = new PeptideHitAnnotation(ph.getSequence(), ph.getModifications(), mascot_dat_file.getMasses(), mascot_dat_file.getParametersSection(), ph.getIonSeriesFound()); ArrayList<DataCell> theoretical_ions = new ArrayList<DataCell>(); Vector ions = pha.getAllTheoreticalFragmentions(); if (ions.size() < 1) { return DataType.getMissingCell(); } for (int i = 0; i < ions.size(); i++) { FragmentIon fm = (FragmentIon) ions.get(i); theoretical_ions.add(new StringCell(fm.getLabel()+"="+fm.getMZ())); } return CollectionCellFactory.createListCell(theoretical_ions); } private DataCell matchingIonsCell(PeptideHit ph, Query q, MascotDatfileInf mascot_dat_file) { PeptideHitAnnotation pha = new PeptideHitAnnotation(ph.getSequence(), ph.getModifications(), mascot_dat_file.getMasses(), mascot_dat_file.getParametersSection(), ph.getIonSeriesFound()); ArrayList<DataCell> matching_ions = new ArrayList<DataCell>(); Vector mascot_ions = pha.getMatchedBYions(q.getPeakList()); // BUG: only report matching B&Y ions for now if (mascot_ions.size() < 1) { return DataType.getMissingCell(); } for (int i = 0; i < mascot_ions.size(); i++) { FragmentIon fm = (FragmentIon) mascot_ions.get(i); matching_ions.add(new StringCell(fm.getLabel()+"="+fm.getMZ())); } return CollectionCellFactory.createListCell(matching_ions); } protected Collection<StringCell> toProtAccsns(PeptideHit ph) { ArrayList<StringCell> al = new ArrayList<StringCell>(); for (Object o : ph.getProteinHits()) { ProteinHit prothit = (ProteinHit) o; al.add(new StringCell(prothit.getAccession())); } return al; } protected Collection<IntCell> toProtStartEnd(PeptideHit ph, boolean want_start) { ArrayList<IntCell> al = new ArrayList<IntCell>(); for (Object o : ph.getProteinHits()) { ProteinHit prothit = (ProteinHit) o; al.add(new IntCell(want_start ? prothit.getStart() : prothit.getStop())); } return al; } protected Collection<IntCell> toProtStart(PeptideHit ph) { return toProtStartEnd(ph, true); } protected Collection<IntCell> toProtEnd(PeptideHit ph) { return toProtStartEnd(ph, false); } /** * {@inheritDoc} */ @Override protected void reset() { } /** * {@inheritDoc} */ @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { return new DataTableSpec[]{null}; } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_folder.saveSettingsTo(settings); m_confidence.saveSettingsTo(settings); m_resulttype.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_folder.loadSettingsFrom(settings); m_confidence.loadSettingsFrom(settings); m_resulttype.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_folder.validateSettings(settings); m_confidence.validateSettings(settings); m_resulttype.validateSettings(settings); } /** * {@inheritDoc} */ @Override protected void loadInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { } /** * {@inheritDoc} */ @Override protected void saveInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { } protected DataCell make_spectra(Query q) { if (q == null || q.getNumberOfPeaks() < 1) return DataType.getMissingCell(); MyMGFPeakList mgf = new MyMGFPeakList(); mgf.setCharge(q.getChargeString()); mgf.setTitle(q.getTitle()); mgf.setPeaks(q.getMZArray(), q.getIntensityArray()); mgf.setPepMass(new Double(q.getPrecursorMZ()).toString()); return SpectraUtilityFactory.createCell(mgf); } }